{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#! /bin/env python\n",
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "预处理\n",
    "corpus其实已经非常好了，已经整理成[label, text]的形式\n",
    "我们需要做的就是，将text分词,去停用词\n",
    "\"\"\"\n",
    "from tools.tokenizer.wordCut import WordCut\n",
    "\n",
    "\n",
    "word_divider = WordCut()\n",
    "file_path = '/home/zhouchengyu/haiNan/textClassifier/data/cnews/cnews.val.txt'\n",
    "write_path = '/home/zhouchengyu/haiNan/textClassifier/data/cnews/val_token.txt'\n",
    "with open(write_path, 'w') as w:\n",
    "    with open(file_path, 'r') as f:\n",
    "        for line in f.readlines():\n",
    "            line = line.decode('utf-8').strip()\n",
    "            token_sen = word_divider.seg_sentence(line.split('\\t')[1])\n",
    "            w.write(line.split('\\t')[0].encode('utf-8') + '\\t' + token_sen.encode('utf-8') + '\\n')    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/zhouchengyu/haiNan/textClassifier/data/cnews/cnews.test.txt has been token and token_file_name is /home/zhouchengyu/haiNan/textClassifier/data/cnews/test_token.txt\n",
      "/home/zhouchengyu/haiNan/textClassifier/data/cnews/cnews.train.txt has been token and token_file_name is /home/zhouchengyu/haiNan/textClassifier/data/cnews/train_token.txt\n",
      "Sub-process(es) done.\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "val数据集最短...所有调通它的分词后\n",
    "我们接下来就要考虑，将train和test文件也分词处理\n",
    "保存各自的token文件到本地\n",
    "\n",
    "多进程处理\n",
    "\"\"\"\n",
    "import multiprocessing\n",
    "\n",
    "\n",
    "tmp_catalog = '/home/zhouchengyu/haiNan/textClassifier/data/cnews/'\n",
    "file_list = [tmp_catalog+'cnews.train.txt', tmp_catalog+'cnews.test.txt']\n",
    "write_list = [tmp_catalog+'train_token.txt', tmp_catalog+'test_token.txt']\n",
    "\n",
    "def tokenFile(file_path, write_path):\n",
    "    word_divider = WordCut()\n",
    "    with open(write_path, 'w') as w:\n",
    "        with open(file_path, 'r') as f:\n",
    "            for line in f.readlines():\n",
    "                line = line.decode('utf-8').strip()\n",
    "                token_sen = word_divider.seg_sentence(line.split('\\t')[1])\n",
    "                w.write(line.split('\\t')[0].encode('utf-8') + '\\t' + token_sen.encode('utf-8') + '\\n') \n",
    "    print file_path + ' has been token and token_file_name is ' + write_path\n",
    "\n",
    "pool = multiprocessing.Pool(processes=4)\n",
    "for file_path, write_path in zip(file_list, write_list):\n",
    "    pool.apply_async(tokenFile, (file_path, write_path, ))\n",
    "pool.close()\n",
    "pool.join() # 调用join()之前必须先调用close()\n",
    "print \"Sub-process(es) done.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5000\n"
     ]
    }
   ],
   "source": [
    "\"\"\"\n",
    "多进程速度还是不错的，但是跟木板效应一样，\n",
    "最终的执行总时间，还是跟最大的文件有关。\n",
    "\n",
    "\"\"\"\n",
    "\n",
    "def constructDataset(path):\n",
    "    \"\"\"\n",
    "    path: file path\n",
    "    rtype: lable_list and corpus_list\n",
    "    \"\"\"\n",
    "    label_list = []\n",
    "    corpus_list = []\n",
    "    with open(path, 'r') as p:\n",
    "        for line in p.readlines():\n",
    "            label_list.append(line.split('\\t')[0])\n",
    "            corpus_list.append(line.split('\\t')[1])\n",
    "    return label_list, corpus_list\n",
    "\n",
    "tmp_catalog = '/home/zhouchengyu/haiNan/textClassifier/data/cnews/'\n",
    "file_path = 'val_token.txt'\n",
    "val_label, val_set = constructDataset(tmp_catalog+file_path)\n",
    "print len(val_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "length of corpus is: 65000\n",
      "how many words: 379000\n",
      "tf-idf shape: (65000,379000)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "\n",
    "tmp_catalog = '/home/zhouchengyu/haiNan/textClassifier/data/cnews/'\n",
    "write_list = [tmp_catalog+'train_token.txt', tmp_catalog+'test_token.txt']\n",
    "\n",
    "tarin_label, train_set = constructDataset(write_list[0]) # 50000\n",
    "test_label, test_set = constructDataset(write_list[1]) # 10000\n",
    "# 计算tf-idf\n",
    "corpus_set = train_set + val_set + test_set # 全量计算tf-idf\n",
    "print \"length of corpus is: \" + str(len(corpus_set))\n",
    "vectorizer = CountVectorizer(min_df=1e-5) # drop df < 1e-5,去低频词\n",
    "transformer = TfidfTransformer()\n",
    "tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus_set))\n",
    "words = vectorizer.get_feature_names()\n",
    "print \"how many words: {0}\".format(len(words))\n",
    "print \"tf-idf shape: ({0},{1})\".format(tfidf.shape[0], tfidf.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "上面有一个容易忽略的点：\n",
    "计算tf-idf需要将train, val, test三方面的数据集全部计算，这样提取到的特征才更加准确\n",
    "\"\"\"\n",
    "from sklearn import preprocessing\n",
    "\n",
    "# encode label\n",
    "corpus_label = tarin_label + val_label + test_label\n",
    "encoder = preprocessing.LabelEncoder()\n",
    "corpus_encode_label = encoder.fit_transform(corpus_label)\n",
    "train_label = corpus_encode_label[:50000]\n",
    "val_label = corpus_encode_label[50000:55000]\n",
    "test_label = corpus_encode_label[55000:]\n",
    "# get tf-idf dataset\n",
    "train_set = tfidf[:50000]\n",
    "val_set = tfidf[50000:55000]\n",
    "test_set = tfidf[55000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean accuracy: 0.9626\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       1.00      1.00      1.00      1000\n",
      "          1       0.99      0.98      0.98      1000\n",
      "          2       0.94      0.87      0.91      1000\n",
      "          3       0.91      0.91      0.91      1000\n",
      "          4       0.97      0.93      0.95      1000\n",
      "          5       0.97      0.98      0.98      1000\n",
      "          6       0.93      0.96      0.95      1000\n",
      "          7       0.99      0.97      0.98      1000\n",
      "          8       0.94      0.99      0.96      1000\n",
      "          9       0.95      0.99      0.97      1000\n",
      "\n",
      "avg / total       0.96      0.96      0.96     10000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report\n",
    "# from sklearn.metrics import confusion_matrix\n",
    "\n",
    "# LogisticRegression classiy model\n",
    "lr_model = LogisticRegression()\n",
    "lr_model.fit(train_set, train_label)\n",
    "print \"val mean accuracy: {0}\".format(lr_model.score(val_set, val_label))\n",
    "y_pred = lr_model.predict(test_set)\n",
    "print classification_report(test_label, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "val mean accuracy: 0.9228\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       1.00      1.00      1.00      1000\n",
      "          1       0.98      0.98      0.98      1000\n",
      "          2       0.89      0.57      0.69      1000\n",
      "          3       0.81      0.97      0.88      1000\n",
      "          4       0.95      0.89      0.92      1000\n",
      "          5       0.97      0.96      0.97      1000\n",
      "          6       0.85      0.94      0.89      1000\n",
      "          7       0.95      0.97      0.96      1000\n",
      "          8       0.95      0.97      0.96      1000\n",
      "          9       0.91      0.99      0.95      1000\n",
      "\n",
      "avg / total       0.93      0.92      0.92     10000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 随机森林分类器\n",
    "from sklearn.ensemble import RandomForestClassifier    \n",
    "\n",
    "\n",
    "rf_model = RandomForestClassifier(n_estimators=200, random_state=1080)\n",
    "rf_model.fit(train_set, train_label)\n",
    "print \"val mean accuracy: {0}\".format(rf_model.score(val_set, val_label))\n",
    "y_pred = rf_model.predict(test_set)\n",
    "print classification_report(test_label, y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "上面采用逻辑回归分类器和随机森林分类器做对比：\n",
    "可以发现，除了个别分类随机森林方法有较大进步，大部分都差于逻辑回归分类器\n",
    "并且200棵树的随机森林耗时过长，比起逻辑回归分类器来说，运算效率太低\n",
    "\"\"\"\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
